library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.1
## Warning: package 'lubridate' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lavaan)
## This is lavaan 0.6-16
## lavaan is FREE software! Please report any bugs.
library(semTools)
##  
## ###############################################################################
## This is semTools 0.5-6
## All users of R (or SEM) are invited to submit functions or ideas for functions.
## ###############################################################################
## 
## Attaching package: 'semTools'
## 
## The following object is masked from 'package:readr':
## 
##     clipboard
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.1
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
holdout <- arrow::read_feather("ignore.data-holdout-set-item-similarity-20230710-164559")
llm_holdout_meta <- arrow::read_feather("ignore.llmdata-holdout-set-item-similarity-20230710-164559.feather")
holdout_real <- holdout %>% 
  select(ItemStemIdA, ItemStemIdB, Pearson) %>% 
  left_join(llm_holdout_meta %>% select(ItemStemIdA = ItemStemId, VariableA = Variable, InstrumentA = instrument, ScaleA = scale_0, SubscaleA = scale_1)) %>% 
  left_join(llm_holdout_meta %>% select(ItemStemIdB = ItemStemId, VariableB = Variable, InstrumentB = instrument, ScaleB = scale_0, SubscaleB = scale_1))
## Joining with `by = join_by(ItemStemIdA)`
## Joining with `by = join_by(ItemStemIdB)`
holdout_llm <- holdout %>% 
  select(ItemStemIdA, ItemStemIdB, CosineSimilarity) %>% 
  left_join(llm_holdout_meta %>% select(ItemStemIdA = ItemStemId, VariableA = Variable, InstrumentA = instrument, ScaleA = scale_0, SubscaleA = scale_1)) %>% 
  left_join(llm_holdout_meta %>% select(ItemStemIdB = ItemStemId, VariableB = Variable, InstrumentB = instrument, ScaleB = scale_0, SubscaleB = scale_1))
## Joining with `by = join_by(ItemStemIdA)`
## Joining with `by = join_by(ItemStemIdB)`
cors_llm <- holdout_llm %>% 
  drop_na(InstrumentA, ScaleA, InstrumentB, ScaleB) %>% 
  select(x = VariableA, y = VariableB, r = CosineSimilarity) %>% 
  as.data.frame() |> 
  igraph::graph_from_data_frame(directed = FALSE) |> 
  igraph::as_adjacency_matrix(attr = "r", sparse = FALSE)
diag(cors_llm) <- 1

cors_real <- holdout_real %>% 
  drop_na(InstrumentA, ScaleA, InstrumentB, ScaleB) %>% 
  select(x = VariableA, y = VariableB, r = Pearson) %>% 
  as.data.frame() |> 
  igraph::graph_from_data_frame(directed = FALSE) |> 
  igraph::as_adjacency_matrix(attr = "r", sparse = FALSE)
diag(cors_real) <- 1

scales <- llm_holdout_meta %>% 
  drop_na(instrument, scale_0) %>% 
  mutate(scale = str_replace_all(paste(instrument, scale_0), "[^a-zA-Z_0-9]", "_")) %>% 
  group_by(scale) %>% 
  summarise(
    items = list(Variable),
    lvn = paste(first(scale), " =~ ", paste(Variable, collapse = " + "))) %>% 
  drop_na()

random_scales <- list()
for(i in 1:25) {
  n_items <- rpois(1, 10)
  random_scales[[i]] <- llm_holdout_meta %>% 
    drop_na(instrument, scale_0) %>% 
    sample_n(n_items) %>%  
    mutate(scale = paste0("random", i)) %>% 
  group_by(scale) %>% 
  summarise(
    items = list(Variable),
    lvn = paste(first(scale), " =~ ", paste(Variable, collapse = " + "))) %>% 
  drop_na()
}
random_scales <- bind_rows(random_scales)
scales <- bind_rows(scales, random_scales)

find_reverse_items <- function(rs) {
  # Calculate the mean correlation for each item, excluding the item's correlation with itself.
  mean_correlations <- apply(rs, 1, function(x) mean(x[-which(x == 1)]))

  # Identify items with negative mean correlation
  # You may adjust the threshold according to your specific criteria.
  threshold <- -0.01
  reverse_keyed_items <- names(which(mean_correlations < threshold))

  # Now you know which items are likely to be reverse-coded.  
  reverse_keyed_items
}

find_reverse_items_by_first_item <- function(rs) {
  # negatively correlated with first item
  items <- rs[-1, 1]
  reverse_keyed_items <- names(items)[which(items < 0)]
  reverse_keyed_items
}

reverse_items <- function(rs, reverse_keyed_items) {
  # Reverse the correlations for the reverse-keyed items
  for (item in reverse_keyed_items) {
    # Get the index of the reverse-keyed item
    item_index <- which(rownames(rs) == item)
    
    # Reverse the correlations
    rs[item_index, ] <- rs[item_index, ] * -1
    rs[, item_index] <- rs[, item_index] * -1
    
    # Since the diagonal is the correlation of the item with itself, set it back to 1
    rs[item_index, item_index] <- 1
  }
  rs
}


scales <- scales %>% 
  rowwise() %>% 
  mutate(r_real = list(cors_real[items, items]),
         r_llm = list(cors_llm[items, items])) %>% 
  mutate(reverse_items = list(find_reverse_items_by_first_item(r_real)),
         r_real_rev = list(reverse_items(r_real, reverse_items)),
         r_llm_rev = list(reverse_items(r_llm, reverse_items))) %>% 
  mutate(
    cfa_real = list(cfa(lvn, sample.cov = r_real_rev, sample.nobs = 1000)),
    rel_real = semTools::compRelSEM(cfa_real)) %>%
  mutate(
    cfa_llm = list(cfa(lvn, sample.cov = r_llm_rev, sample.nobs = 1000)),
    rel_llm = semTools::compRelSEM(cfa_llm))
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `cfa_llm = list(cfa(lvn, sample.cov = r_llm_rev, sample.nobs =
##   1000))`.
## ℹ In row 27.
## Caused by warning in `lav_object_post_check()`:
## ! lavaan WARNING: some estimated ov variances are negative
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
scales <- scales %>% filter(between(as.vector(rel_llm), 0, 1)) %>% filter(between(as.vector(rel_real), 0, 1))
cor.test(scales$rel_llm, scales$rel_real)
## 
##  Pearson's product-moment correlation
## 
## data:  scales$rel_llm and scales$rel_real
## t = 9.0572, df = 48, p-value = 5.844e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6623552 0.8784135
## sample estimates:
##       cor 
## 0.7942677
scales %>% filter(scale == "LOT_Optimism") %>% pull(cfa_real) %>% .[[1]] %>% summary
## lavaan 0.6.16 ended normally after 33 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        12
## 
##   Number of observations                          1000
## 
## Model Test User Model:
##                                                       
##   Test statistic                                67.031
##   Degrees of freedom                                 9
##   P-value (Chi-square)                           0.000
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   LOT_Optimism =~                                     
##     opt_1             1.000                           
##     opt_2             1.370    0.152    8.995    0.000
##     opt_3             1.269    0.146    8.716    0.000
##     opt_4             1.656    0.173    9.581    0.000
##     opt_5             1.758    0.181    9.717    0.000
##     opt_6             1.461    0.159    9.213    0.000
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .opt_1             0.854    0.041   20.977    0.000
##    .opt_2             0.728    0.038   19.275    0.000
##    .opt_3             0.766    0.039   19.854    0.000
##    .opt_4             0.603    0.036   16.844    0.000
##    .opt_5             0.552    0.035   15.583    0.000
##    .opt_6             0.691    0.037   18.649    0.000
##     LOT_Optimism      0.145    0.027    5.334    0.000
scales %>% 
  mutate(rel_real = round(rel_real, 2)) %>% 
  mutate(rel_llm = round(rel_llm, 2)) %>% 
ggplot(., aes(rel_real, rel_llm, label = scale)) + 
  geom_abline(linetype = "dashed") +
  geom_point() +
  coord_fixed(xlim = c(0,1), ylim = c(0,1)) -> p

ggplotly(p)